Loading libraries and set directories

library(dplyr)
library(apeglm)
library(org.Hs.eg.db)
library(Seurat)
library(sctransform)
library(SeuratObject)
library(DESeq2)
library(enrichplot)
library(clusterProfiler)
library(AnnotationDbi)
library(SingleCellExperiment)
library(muscat)
library(ggplot2)
library(gprofiler2)
library(ggrepel)
library(msigdbr)


MEDIApath="/home/mclaudia/MEDIA Project/"
MEDIAgraph="/home/mclaudia/MEDIA Project/Codici_finali/Paper_figures/"
MEDIAsave="/home/mclaudia/MEDIA Project/Codici_finali/"
wd="/home/mclaudia/MEDIA Project/GSE152805_RAW_scell/"

SingleCell Dataset from Chou et al., 2020 (doi:10.1038/s41598-020-67730-y): Dataset 1

Build merged Seurat Object of the three patients: s113, s116, s118

MT: medial region of the tibia (affected cartilage)

OLT: outer lateral region of the tibia (healthy cartilage)

MT1.data <-Read10X(data.dir =paste0(wd,"MT_113"))
MT1<- CreateSeuratObject(counts = MT1.data, 
                         project = "MT1", min.cells = 300, min.features = 500)

MT2.data <-Read10X(data.dir = paste0(wd,"MT_116"))
MT2<- CreateSeuratObject(counts = MT2.data, 
                         project = "MT2", min.cells = 300, min.features = 500)

MT3.data <-Read10X(data.dir = paste0(wd,"MT_118"))
MT3<- CreateSeuratObject(counts = MT3.data, 
                         project = "MT3", min.cells = 300, min.features = 500)

olt1.data <-Read10X(data.dir = paste0(wd,"olt_113"))
olt1<- CreateSeuratObject(counts = olt1.data, 
                          project = "olt1", min.cells = 300, min.features = 500)

olt2.data <-Read10X(data.dir = paste0(wd,"olt_116"))
olt2<- CreateSeuratObject(counts = olt2.data, 
                          project = "olt2", min.cells = 300, min.features = 500)

olt3.data <-Read10X(data.dir = paste0(wd,"olt_118"))
olt3<- CreateSeuratObject(counts = olt3.data, 
                          project = "olt3", min.cells = 300, min.features = 500)

tot113 <- merge(MT1, y = olt1, add.cell.ids = c("MT_s113", "Olt_s113"), project = "tot113")
length(Cells(tot113))

tot116 <- merge(MT2, y = olt2, add.cell.ids = c("MT_s116", "Olt_s116"), project = "tot116")
length(Cells(tot116))

tot118 <- merge(MT3, y = olt3, add.cell.ids = c("MT_s118", "Olt_s118"), project = "tot118")
length(Cells(tot118))

pt_list <-list(tot113,tot116,tot118)
names(pt_list) <-c("tot113","tot116","tot118")

Pre-processing

for( i in 1:length(pt_list)){
  
  counts <- GetAssayData(object = pt_list[[i]], slot = "counts")
  
  nonzero <- counts > 0
  keep_genes <- Matrix::rowSums(nonzero) >= 100
  filtered_counts <- counts[keep_genes, ]
  filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = pt_list[[i]]@meta.data)
  
  tmp <-filtered_seurat
  
  tmp[["percent.mt"]] <- PercentageFeatureSet(tmp , pattern = "^MT-")

  VlnPlot(tmp, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
  
  plot1 <- FeatureScatter(tmp, feature1 = "nCount_RNA", feature2 = "percent.mt")
  plot2 <- FeatureScatter(tmp, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
 # plot1 + plot2, visualize to set nFeature_RNA, nCount_RNA and percent.mt thresholds
  
  if(i == 1) tmp <- subset(tmp, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 4 & nCount_RNA < 50000)
  if(i == 2) tmp <- subset(tmp, subset = nFeature_RNA > 200 & nFeature_RNA < 4000 & percent.mt < 5 & nCount_RNA < 50000)
  if(i == 3) tmp <- subset(tmp, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 3 & nCount_RNA < 40000)
  
  tmp<- NormalizeData(tmp)
  tmp1<- FindVariableFeatures(tmp, selection.method = "vst", nfeatures = 8500)
  pt_list[[i]] <-tmp1
}

Integrate patient data

features <- SelectIntegrationFeatures(pt_list, nfeatures = 8500)

anchors <- FindIntegrationAnchors(pt_list, anchor.features=features)

OAtot <-IntegrateData(anchors, new.assay.name = "integrate")

Examine and visualize PCA

all.genes <-  rownames(OAtot)

OAtot <- ScaleData(OAtot, features = all.genes)
OAtot <- RunPCA(OAtot, features = VariableFeatures(object = OAtot))
print(OAtot[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1 
## Positive:  S100A4, COL1A2, CRTAC1, HTRA1, COL3A1 
## Negative:  FRZB, CFH, JUNB, ACAN, COL9A3 
## PC_ 2 
## Positive:  CHI3L2, NNMT, CHI3L1, CLU, RPL13 
## Negative:  FGFBP2, S100A1, SCRG1, C2orf82, CHAD 
## PC_ 3 
## Positive:  COMP, OMD, PRELP, CILP, SMOC2 
## Negative:  LMNA, COL1A1, VIM, NBL1, B2M 
## PC_ 4 
## Positive:  IBSP, COL10A1, SPP1, MT-ND3, SOD3 
## Negative:  COL2A1, EEF1A1, RPL3, CILP2, RPS2 
## PC_ 5 
## Positive:  RPL23A, RPS16, RPL37, RPL37A, RPLP2 
## Negative:  CLU, ITM2B, LUM, DCN, PDIA3

Run UMAP: 10 PC chosen

VizDimLoadings(OAtot, dims = 1:2, reduction = "pca")

DimPlot(OAtot, reduction = "pca")

DimHeatmap(OAtot, dims = 1:15, cells = 500, balanced = TRUE)

ElbowPlot(OAtot) ###9-12 PC

OAtot <- FindNeighbors(OAtot, dims = 1:10)
OAtot <- FindClusters(OAtot, resolution = 0.5) 
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 23752
## Number of edges: 690120
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8396
## Number of communities: 9
## Elapsed time: 12 seconds
OAtot <- RunUMAP(OAtot, dims = 1:10)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

Save UMAP coordinates for reproducibility

umapCoord <- Embeddings(object = OAtot[["umap"]])
save(umapCoord,file=paste0(MEDIAsave,"CoordUMAP.RData"))

Visualize UMAP for: clusterization, class, single patient

load(paste0(MEDIAsave,"CoordUMAP.RData"))
OAtot@meta.data$type <-sapply(strsplit(rownames(OAtot@meta.data),"_"), function(x) x[1])
OAtot@meta.data$patient <-sapply(strsplit(rownames(OAtot@meta.data),"_"), function(x) x[2])

OAtot@reductions[["umap"]]@cell.embeddings  <- umapCoord

DimPlot(OAtot, reduction = "umap", label = TRUE, label.size = 7)+
   ggtitle("Clustering")+
   theme(plot.title = element_text(hjust = 0.5))

DimPlot(OAtot, reduction = "umap", group.by = "type", label = TRUE, 
        label.size = 7,cols= c("turquoise","orangered"))+
        ggtitle("Class")

DimPlot(OAtot, reduction = "umap", group.by = "patient")+
        ggtitle("Patient")

Pseudo-bulk singleCell data trasformation

ei <-OAtot@meta.data[,c(1,6,7,8)]

head(ei) 
##                            orig.ident seurat_clusters type patient
## MT_s113_AAACCTGGTAGCACGA-1        MT1               2   MT    s113
## MT_s113_AAACGGGAGGCAGGTT-1        MT1               2   MT    s113
## MT_s113_AAACGGGCAAATACAG-1        MT1               2   MT    s113
## MT_s113_AAACGGGCAAGAGTCG-1        MT1               0   MT    s113
## MT_s113_AAACGGGCACCGCTAG-1        MT1               0   MT    s113
## MT_s113_AAACGGGCAGTCTTCC-1        MT1               0   MT    s113
ei$clus <-0  #merge in unique

OA_sce <-as.SingleCellExperiment(OAtot, assay = "RNA") 
OA_sce$clus <-0

(OA_sce2 <- prepSCE(OA_sce, 
                    kid = "seurat_clusters",
                    gid = "type",  
                    sid = "orig.ident",   
                    drop = FALSE))  
## class: SingleCellExperiment 
## dim: 10060 23752 
## metadata(1): experiment_info
## assays(2): counts logcounts
## rownames(10060): AP006222.2 SAMD11 ... LCA5L LINC00205
## rowData names(0):
## colnames(23752): MT_s113_AAACCTGGTAGCACGA-1 MT_s113_AAACGGGAGGCAGGTT-1
##   ... Olt_s118_TTTGTCAGTGCACGAA-1 Olt_s118_TTTGTCATCCTAAGTG-1
## colData names(10): cluster_id sample_id ... ident clus
## reducedDimNames(2): PCA UMAP
## mainExpName: RNA
## altExpNames(0):
OA_sce2$cond <-paste0(OA_sce2$sample_id,"_",OA_sce2$patient)

experiment_info <- data.frame(sample_id=as.factor(c("MT1","MT2","MT3","olt1","olt2","olt3")),
                              cond=as.factor(c("MT1_s113 ","MT2_s116",
                                               "MT3_s118","olt1_s113","olt2_s116","olt3_s118")),
                              patient=as.factor(rep(c("s113","s116","s118"),2)),
                              group_id=as.factor(c("MT","MT","MT","Olt","Olt","Olt")),
                              n_cells=as.numeric(table(ei$orig.ident)))


OA_sce2@metadata <- list(experiment_info)
names(OA_sce2@metadata) <-"experiment_info"


table(OA_sce2$cluster_id,OA_sce2$sample_id)
##    
##      MT1  MT2  MT3 olt1 olt2 olt3
##   0 1733  903 2477  223  321  325
##   1   64   27   62 2260 1135 2037
##   2  897  442 1189  101   67   70
##   3    2   13   81 1350  933  378
##   4    0    0   12  858  895  633
##   5   18  134   42 1115  259  474
##   6  243  693  377   16    5   66
##   7   51  486    4    2    0   20
##   8   22    0   22  155   46   14
pb <- aggregateData(OA_sce2, assay = "counts", fun = "sum")
colData(pb)
## DataFrame with 6 rows and 4 columns
##      group_id     patient      clus        cond
##      <factor> <character> <numeric> <character>
## MT1       MT         s113         0    MT1_s113
## MT2       MT         s116         0    MT2_s116
## MT3       MT         s118         0    MT3_s118
## olt1      Olt        s113         0   olt1_s113
## olt2      Olt        s116         0   olt2_s116
## olt3      Olt        s118         0   olt3_s118
plotPCA(OA_sce2, colour_by = "group_id")

pbMDS(pb)

colData(pb)$patient <-as.factor(colData(pb)$patient)
colData(pb)$cond <-as.factor(colData(pb)$cond)

ei <- metadata(pb)$experiment_info

pb$group_id <-relevel(pb$group_id,ref="Olt")
ei$group_id <-relevel(ei$group_id,ref="Olt")


ass <-matrix(0, nrow = nrow(pb@int_elementMetadata), ncol = 6)
colnames(ass) <-c("MT1","MT2","MT3","olt1","olt2","olt3")

for(i in c(1:9)){
  as <-as.data.frame(as.matrix(pb@assays@data@listData[[i]]))
  ass <-ass+as
}

ass <- ass+1  #add dummy value = 1

cd <- colData(pb)
y <- DESeqDataSetFromMatrix(ass, cd, design=~patient+group_id)

dds <- DESeq(y)
resultsNames(dds)
## [1] "Intercept"            "patient_s116_vs_s113" "patient_s118_vs_s113"
## [4] "group_id_MT_vs_Olt"
res <-results(dds,alpha=.9,name="group_id_MT_vs_Olt",pAdjustMethod = "fdr")

res3 <-lfcShrink(dds, coef="group_id_MT_vs_Olt",res =res,type = "apeglm")
res3 <- as.data.frame(res3)
res3 <-res3[order(res3$padj, decreasing = FALSE),]

res3[1:10,]
##         baseMean log2FoldChange      lfcSE       pvalue         padj
## COL1A1 12787.080      14.356062 0.83122461 1.311834e-77 1.319705e-73
## ABI3BP  9863.181       1.605177 0.09261043 2.269700e-68 1.141659e-64
## TGFBI  12216.317       2.863385 0.16669522 7.913243e-68 2.653574e-64
## TSPAN2 13968.174       1.685722 0.10144715 3.349151e-63 8.423114e-60
## PDLIM7  2412.373       2.232388 0.14327520 4.262484e-56 8.576118e-53
## CRLF1   3719.325      12.742026 0.81759672 2.333293e-52 3.912155e-49
## MT1G   77050.541       1.863951 0.12565511 5.386437e-51 7.741079e-48
## CRTAC1 46489.316       2.173891 0.14739598 1.417025e-50 1.781909e-47
## KLF10   6336.885      -1.184766 0.08453508 9.633670e-46 1.076830e-42
## CLIC3   1633.566      11.641215 0.81663338 4.616620e-44 4.644320e-41

Save files

save(res3,file=paste0(MEDIAsave,"DE_DESeq2_SCell.RData"))

Volcano Plot of DE genes

annot <-res3[,c(2,5)]
annot$gene_names <-rownames(res3)

annot$col <-"gray50"
annot[annot$log2FoldChange <= -9 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange  >= 9 & annot$padj <=0.05, "col"] <- "red"

annot$label <- NA
annot[annot$log2FoldChange  >= 10 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange  >= 10 & annot$padj <=0.05, "gene_names"]
annot[annot$log2FoldChange  <= -10 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange   <= -10 & annot$padj <=0.05, "gene_names"]

ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
  geom_point(size=1) +
  geom_label_repel(size=4,
                   min.segment.length = 0,direction = "both",
                   max.overlaps =40,
                   color="black",
                   segment.linetype=2) +
  scale_shape_manual(values=c(8, 19))+
  scale_color_manual(values = c("red","gray50","springgreen3"),
                     breaks = c("red","gray50","springgreen3"),
                     labels = c("UP","notDE","DOWN")) +
  labs(color="Legend")+
  xlab("log2(FC)")+
  ylab("-log10(FDR)")+
  theme_bw()+
  ggtitle("Volcano plot of DE genes - Dataset 1")+
  geom_vline(xintercept=c(-9, 9), col="gray70", linetype=2)+
  geom_hline(yintercept=-log10(0.05), col="gray70", linetype=2)

Enrichment with GSEA: BP and REACTOME

hs=get("org.Hs.eg.db")

genes1 <- annot[, c("log2FoldChange","gene_names")]
genes <- genes1$log2FoldChange
names(genes) <-genes1$gene_names

genes <-sort(genes, decreasing = TRUE)

gsebp <- gseGO(geneList=genes, 
               ont ="BP", 
               keyType = "SYMBOL", 
               pvalueCutoff = 0.01,
               eps = 0,
               verbose = TRUE, 
               OrgDb = hs, 
               pAdjustMethod = "fdr")

dotplot(gsebp, showCategory = 20, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)

gsreac <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
                             eps = 0,
                             verbose = TRUE, 
                             minGSSize =5, pAdjustMethod = "fdr",
                             pvalueCutoff =0.1)
dotplot(gsreac, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
  theme(axis.text.y = element_text(size = 8))